# -*- coding: utf-8 -*-
# @Time: 2022/8/6 16:40
# @Author: Tanya
# @File: data_clean.py
# @Software: PyCharm
# @Summary: Clean the raw Taobao sales dataset and save it to taobao_cleaned.csv

import pandas as pd
import numpy as np

# Load the raw Taobao sales export.
data = pd.read_csv('淘宝销售数据集.csv')

# Remove exact duplicate rows (compared across all columns, user_geohash
# included), then drop the geographic column. Per the original author's
# inspection the missing values are confined to user_geohash, which is not
# needed for user-behaviour analysis, so the column is removed rather than
# imputed. The index is rebuilt so it is contiguous after the row removal.
data = (
    data.drop_duplicates()
    .drop(columns='user_geohash')
    .reset_index(drop=True)
)

# Split `time` into a `date` column and an `hour` column so the behaviour
# can be analysed at different time granularities.
data['time'] = pd.to_datetime(data['time'])
# normalize() truncates each timestamp to midnight while staying datetime64,
# avoiding a slow round-trip through Python `date` objects and back.
data['date'] = data['time'].dt.normalize()
data['hour'] = data['time'].dt.hour

# Item identifiers are labels, not quantities: keep them as strings.
data = data.astype({'item_id': str, 'item_category': str})

# The original timestamp is fully represented by `date` + `hour`.
data = data.drop(columns='time')

# Persist the cleaned table without the positional index and report its size.
data.to_csv('taobao_cleaned.csv', mode='w', encoding='utf-8', index=False)
print('已完成数据清洗，并保存至taobao_cleaned，数据条数：', len(data))


